#!/usr/bin/python
#coding: utf-8
import sys
import csv
import re # library for pattern matching


# Delimiter and quoting characters shared by the CSV helpers below.
semicolon = ';'     # field delimiter of the written file
comma = ','         # field delimiter of the input file
doublequote = "\""  # quote character of the written file
singlequote = "'"   # quote character of the input file
dash = '-'

def get_new_col_idx(input_file):
    """Return the index of the first free column in a CSV file.

    The file is parsed as comma-delimited with single-quote quoting, and the
    largest number of fields found in any row is returned.  Because columns
    are zero-based, that count is also the index a newly appended column
    would get in the widest row.

    Args:
        input_file: Path of the CSV file to inspect.

    Returns:
        The maximum row length, or 0 when the file contains no rows.
    """
    # The csv module wants a binary handle on Python 2 and a text handle
    # opened with newline='' on Python 3.
    # NOTE(review): UTF-8 is assumed on Python 3 because the script itself is
    # UTF-8 encoded and matches UTF-8 characters -- confirm for your data.
    if sys.version_info[0] < 3:
        handle = open(input_file, 'rb')
    else:
        handle = open(input_file, 'r', newline='', encoding='utf-8')

    widest = 0
    with handle as f:
        reader = csv.reader(f, delimiter=comma, quotechar=singlequote)
        for row in reader:
            widest = max(widest, len(row))
    return widest
	
def translate_file(input_file, idx, output_file):
    """Translate the CFG glycan column of a CSV file and return the new rows.

    Each row of ``input_file`` (comma-delimited, single-quote quoting) is
    read, spaces are removed from the glycan string in column ``idx``, the
    aglycon suffix is split off with ``get_aglycon`` and the remaining glycan
    part is converted with ``translate_entry``.  Three values are appended to
    every row, in this order: the translated glycan followed by the aglycon,
    the translated glycan alone, and the aglycon without its leading dash.
    The combined string is also printed to stdout, one line per row.

    Args:
        input_file: Path of the CSV file to read.
        idx: Index of the column holding the CFG glycan string.
        output_file: Unused; kept so existing callers keep working.

    Returns:
        A list of rows (lists of strings), each extended by three columns.

    Raises:
        IndexError: If a row has no column ``idx`` (e.g. a blank line).
    """
    # The csv module wants a binary handle on Python 2 and a text handle
    # opened with newline='' on Python 3.
    # NOTE(review): UTF-8 is assumed on Python 3 because the script itself is
    # UTF-8 encoded and matches UTF-8 characters -- confirm for your data.
    if sys.version_info[0] < 3:
        handle = open(input_file, 'rb')
    else:
        handle = open(input_file, 'r', newline='', encoding='utf-8')

    rows = []
    with handle as f:
        reader = csv.reader(f, delimiter=comma, quotechar=singlequote)
        for row in reader:
            cfg = row[idx].replace(' ', '')
            aglycon = get_aglycon(cfg)

            if aglycon:
                # Plain substring lookup: the aglycon is literal text taken
                # from cfg, so it must not be re-interpreted as a regex.
                cfg_short = cfg[:cfg.find(aglycon)]
            else:
                # No aglycon recognised: the whole string is the glycan.
                cfg_short = cfg

            iupac_short = translate_entry(cfg_short)
            # Drop the separator in front of the aglycon.  lstrip removes a
            # hyphen or a (possibly multi-byte) en dash and leaves an aglycon
            # that was matched without any dash untouched.
            aglycon = aglycon.lstrip('-–')
            iupac = iupac_short + aglycon
            print(iupac)

            row.append(iupac)
            row.append(iupac_short)
            row.append(aglycon)
            rows.append(row)

    return rows

def get_aglycon(cfg_seq):
    """Return the aglycon suffix found in a CFG glycan string.

    Four kinds of aglycon are recognised: a spacer (``Sp`` followed by
    optional digits and preceded by any number of hyphens or en dashes),
    ``-Cer``, ``-Ser/Thr`` and ``-R``.  When several of them occur in the
    string, the spacer wins over ``-Cer``, which wins over ``-Ser/Thr``,
    which wins over ``-R``.

    Args:
        cfg_seq: Glycan string to search.

    Returns:
        The matched text including its leading dash(es), or an empty string
        when no aglycon is found.
    """
    # Ordered from highest to lowest precedence.
    patterns = (
        # The en dash is listed twice on purpose: literally, so it matches
        # decoded text on Python 3 and UTF-8 bytes on Python 2, and as the
        # original escaped UTF-8 byte sequence.
        r'(\-|–|\xe2\x80\x93)*Sp[0-9]*',
        '-Cer',
        '-Ser/Thr',
        '-R',
    )

    for pattern in patterns:
        found = re.search(pattern, cfg_seq)
        if found is not None:
            return found.group(0)
    return ""

def translate_entry(cfg_short):
    """Rewrite a CFG glycan string (aglycon already removed) in IUPAC-like form.

    Steps, in order:
      1. round brackets become square brackets,
      2. the Greek letters alpha and beta become ``a`` and ``b``,
      3. every ``<char><pos>-<pos>`` run (presumably anomer plus linkage
         positions, e.g. ``b1-4``) is wrapped in round brackets,
      4. a trailing ``a``/``b`` plus one position character at the very end
         of the string becomes ``(<that>-?)``,
      5. empty ``[]`` pairs are removed.

    Args:
        cfg_short: Glycan part of a CFG string.

    Returns:
        The converted string.
    """
    link_pattern = r'(.[0-9|?]\-[0-9|?])'
    trailing_link_pattern = r'(.+?[^(])([ab][0-9|?])$'

    text = cfg_short
    for old, new in (('(', '['), (')', ']'), ('α', 'a'), ('β', 'b')):
        text = text.replace(old, new)

    text = re.sub(link_pattern, r'(\1)', text)
    text = re.sub(trailing_link_pattern, r'\1(\2-?)', text)
    return text.replace('[]', '')

def write_to_file(data,output_file_path):
    """Write rows to a semicolon-delimited CSV file.

    The file is created or overwritten.  Fields are quoted with double
    quotes where the csv module's default quoting rules require it.

    Args:
        data: Iterable of rows, each row an iterable of field values.
        output_file_path: Path of the file to write.
    """
    # The csv module wants a binary handle on Python 2 and a text handle
    # opened with newline='' on Python 3.
    # NOTE(review): UTF-8 output is assumed on Python 3 -- confirm.
    if sys.version_info[0] < 3:
        handle = open(output_file_path, 'wb')
    else:
        handle = open(output_file_path, 'w', newline='', encoding='utf-8')

    with handle as csvfile:
        data_writer = csv.writer(csvfile, delimiter=semicolon,
                                 quotechar=doublequote)
        data_writer.writerows(data)


def main():
    """Command-line entry point.

    Expects three arguments: the input CSV path, the index of the column
    that holds the glycan strings, and the output CSV path.  The widest row
    length of the input is printed, then the translated rows are written to
    the output file.

    Raises:
        SystemExit: If fewer than three arguments are given.
        ValueError: If the column index is not an integer.
    """
    if len(sys.argv) < 4:
        # Exit with a readable message instead of a bare IndexError.
        sys.exit("usage: %s INPUT_CSV COLUMN_INDEX OUTPUT_CSV" % sys.argv[0])

    input_file = sys.argv[1]
    input_column_idx = int(sys.argv[2])
    output_file = sys.argv[3]

    new_col_idx = get_new_col_idx(input_file)
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("new_col_idx : " + str(new_col_idx))

    data = translate_file(input_file, input_column_idx, output_file)
    write_to_file(data, output_file)
   
# Run the conversion only when executed as a script, not when imported.
if "__main__" == __name__:
    main()
